require(readstata13)
require(data.table)
require(dplyr)
library(tidyr)
library(foreign)
library(RSQLite)
library(stringdist)
library(rgdal)
require(multiwayvcov)

# Start from an empty workspace. This has to run BEFORE any helper is defined:
# rm(list = ls()) deletes every object in the global environment, including
# functions created earlier in the script.
rm(list=ls())

# trim(): remove leading and trailing whitespace from every element of a
# character vector; whitespace inside the string is left untouched.
trim <- function (x) gsub("^\\s+|\\s+$", "", x)

#########################################################################################################################
# Aim: Create dataset with the 1851 characteristics of the parish of residence of individuals in 1881

# Datasets used:  
# (1) MolaModified_NAPP_JAG4new.dta: baseline data
# (2) social_points.csv: names of the street points and parish from the historical map
# (3) icem1851.sqlite: 1851 ICeM census
# (4) dictionary_final3.RData: dictionary of occupations categories
# (5) pooleddataNAPPneww50list.RData and Proof2RCoordnew

# Dataset created: movers.csv
#########################################################################################################################

#####################
### 1. NAPP 1881  : Get the addresses, parish, parish of birth to match to the 1851 ICEM ###
#####################

# Working directory: the C_ANALYSIS folder under the current user's Dropbox.
setwd(paste0(
  "/Users/",
  Sys.info()[["user"]],
  "/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/C_ANALYSIS"
))

# Baseline 1881 NAPP sample (Stata file).
napp <- read.dta13(paste0(
  "/Users/",
  Sys.info()[["user"]],
  "/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/NAPP/MolaModified_NAPP_JAG4new.dta"
))

# Sample restrictions, applied one after the other:
#   males, aged 15 to 60, marital status and nativity not coded as unknown,
#   household heads.
# NOTE(review): the filters below do not test occupation or foreign birth
#   directly (only MARST != "Unknown and NIU" and nativity != "Unknown");
#   confirm this matches the intended sample definition.
# NOTE(review): as.numeric() returns level codes if AGE was imported as a
#   factor - TODO confirm AGE's type after read.dta13().
# NOTE(review): logical row indexing yields an all-NA row wherever the
#   condition is NA - confirm these columns contain no missing values.
napp <- napp[napp$SEX == "Male", ]
napp$AGE <- as.numeric(napp$AGE)
napp <- napp[!(napp$AGE < 15 | napp$AGE > 60), ]
napp <- napp[napp$MARST != "Unknown and NIU", ]
napp <- napp[napp$nativity != "Unknown", ]
napp <- napp[napp$hhhead == 1, ]

# Occupational category of each individual.
# The classnew* dummies are scanned in the order listed below and a later
# match overwrites an earlier one, so classnew0 ("unemployed") is applied last.
napp$cat <- ""
napp$cat <- local({
  class_labels <- c(
    classnew1 = "professional",
    classnew2 = "domestic",
    classnew3 = "commercial",
    classnew4 = "artisan",
    classnew5 = "builder",
    classnew6 = "food",
    classnew7 = "service",
    classnew0 = "unemployed"
  )
  assigned <- napp$cat
  for (dummy in names(class_labels)) {
    # exact = FALSE gives the same column lookup as napp$<name>
    assigned <- ifelse(
      napp[[dummy, exact = FALSE]] == 1,
      class_labels[[dummy]],
      assigned
    )
  }
  assigned
})

# Control variables
# married: 1 when MARST is one of the two "Married" codes or "Widowed", else 0.
napp$married <- ifelse(
  napp$MARST == "Married, spouse present" |
    napp$MARST == "Married, spouse absent" |
    napp$MARST == "Widowed",
  1,
  0
)
# nchild / nservant: numeric copies of tot_child and SERVANGB.
# NOTE(review): if either source column is a factor, as.numeric() returns the
#   level codes rather than the labelled values - TODO confirm column types.
napp$nchild <- as.numeric(napp$tot_child)
napp$nservant <- as.numeric(napp$SERVANGB)
# stayerp: 1 when the migrant field is "Eng - in parish/sub/district of birth".
napp$stayerp <- ifelse(
  napp$migrant == "Eng - in parish/sub/district of birth",
  1,
  0
)

# Sample restriction: drop the "unemployed" and "domestic" categories
napp <- napp[!(napp$cat == "unemployed" | napp$cat == "domestic"), ]

# Keep only the identifiers, address fields, category and controls used below
napp <- napp[, c(
  "ROCQUE_ID", "HHNUMhead", "hhhead", "RECIDGB", "social", "civil",
  "NAMELAST", "NAMEFRST", "AGE", "OCC81GB", "PARIDGB", "parish", "ADDRESS",
  "name", "name2", "new_address", "new_p_name", "objectid_1",
  "cat", "married", "nchild", "nservant", "stayerp"
)]

# Number of distinct values per geographic identifier
# (the figures are the counts recorded by the authors)
length(unique(napp$parish)) # 170
length(unique(napp$PARIDGB)) # 173
length(unique(napp$social)) # 357
length(unique(napp$civil)) # 56
length(unique(napp$objectid_1)) # 5879
length(unique(napp$ADDRESS)) # 127,642
length(unique(napp$name)) # 169
length(unique(napp$new_address)) # 8,494

# Street points and social parishes taken from the historical map:
# keep the points that have a name, restricted to the five columns needed,
# and drop duplicated rows.
social <- read.csv("social_points.csv")
social <- social[
  social$P_NAME != "",
  c("OBJECTID_1", "ROCQUE_ID", "P_NAME", "NAME", "IDSocial")
]
social <- unique(social)

# Number of distinct values per column (counts recorded by the authors)
length(unique(social$OBJECTID_1)) # 16309
length(unique(social$ROCQUE_ID)) # 5435
length(unique(social$P_NAME)) # 10,100
length(unique(social$NAME)) # 181
length(unique(social$IDSocial)) # 363

# Inner join of the NAPP records with the map points on the object id.
# ROCQUE_ID exists in both tables; the NAPP copy (suffix .x) is the one kept.
napp <- merge(napp, social, by.x = "objectid_1", by.y = "OBJECTID_1")
napp <- napp[, c(
  "objectid_1", "ROCQUE_ID.x", "HHNUMhead", "hhhead", "RECIDGB", "social",
  "civil", "NAMELAST", "NAMEFRST", "AGE", "OCC81GB", "PARIDGB", "parish",
  "ADDRESS", "name", "name2", "new_address", "new_p_name",
  "cat", "married", "nchild", "nservant", "stayerp",
  "P_NAME", "NAME", "IDSocial"
)]
rm(social)

# Clean the names of parish (NAPP 1881).
# Each row of the table is one gsub(pattern, replacement) rule. The rules are
# applied strictly top to bottom and later rules rewrite the output of earlier
# ones, so the order of the rows must not be changed.
# NOTE(review): gsub() treats each pattern as an unanchored regular expression.
#   Short patterns such as "BOW" or "ST THOMAS" therefore also rewrite longer
#   names that contain them, and the parentheses in the two
#   "... (BELOW THE BARS) ..." patterns near the end act as regex groups,
#   not as literal "(" / ")". Confirm this is the intended matching.
napp$PARIDGB <- local({
  rules <- matrix(c(
    "ALLHALLOWS LONDON WALL", "ALL HALLOWS LONDON WALL",
    "ALLHALLOWS STAINING", "ALL HALLOWS STAINING",
    "BROMLEY ST LEONARD", "BROMLEY",
    "CHRIST CHURCH", "CHRISTCHURCH SOUTHWARK",
    "RATCLIFFE", "RATCLIFF",
    "ST ANDREW HOLBORN", "ST ANDREW HOLBORN BELOW THE BARS",
    "ST ANDREW HOLBORN", "ST ANDREW HOLBORN (BELOW THE BARS)",
    "ST ANDREW HOLBORN", "ST ANDREW HOLBORN ABOVE THE BARS",
    "ST ANDREW HOLBORN GEORGE THE MARTYR", "ST ANDREW HOLBORN",
    "HOLBORN THE MARTYR", "ST ANDREW HOLBORN",
    "ST BOTOLPH ALDGATE", "ALDGATE",
    "ST BOTOLPH ALDGATE AKA EAST SMITHFIELD", "ALDGATE",
    "ST GILES IN THE FIELDS", "ST GILES FIELDS",
    "ST JAMES CLERKENWELL", "CLERKENWELL",
    "ST JAMES WESTMINSTER", "WESTMINSTER",
    "ST MARY LAMBETH", "LAMBETH",
    "ST MARY WHITECHAPEL", "WHITECHAPEL",
    "ST MARY MAGDALEN BERMONDSEY", "BERMONDSEY",
    "ST BOTOLPH WITHOUT ALDERSGATE", "ST BOTOLPH WITHOUT ALDGATE",
    "ST THOMAS SOUTHWARK", "SOUTHWARK",
    "ST GILES WITHOUT CRIPPLEGATE", "ST GILES CRIPPLEGATE",
    "ST LEONARD SHOREDITCH", "SHOREDITCH",
    "ST LUKE OLD STREET", "ST LUKE",
    "ST MARGARET WESTMINSTER", "WESTMINSTER",
    "ALDGATE AKA EAST SMITHFIELD", "ALDGATE",
    "CHRISTCHURCH SOUTHWARK", "SOUTHWARK",
    "ST ANNE LIMEHOUSE", "LIMEHOUSE",
    "ST LEONARD SHOREDITCH", "SHOREDITCH",
    "ST MARGARET WESTMINSTER", "WESTMINSTER",
    "ST MARY PADDINGTON", "PADDINGTON",
    "ST MARY ROTHERHITHE", "ROTHERHITHE",
    "ST MATTHEW BETHNAL GREEN", "BETHNAL GREEN",
    "ST OLAVE SOUTHWARK", "SOUTHWARK",
    "ST SAVIOUR SOUTHWARK", "SOUTHWARK",
    "ST THOMAS SOUTHWARK", "ST THOMAS",
    "ST GILES WITHOUT CRIPPLEGATE", "ST GILES CRIPPLEGATE",
    "ST KATHERINE CREECHURCH", "ST KATHERINE CREE",
    "ST MARY ALDERMANBURY", "ST MARY ALDERMARY",
    "BOW", "BOW OR ST MARY STRATFORD LE BOW",
    "BROMLEY", "BROMLEY ST LEONARD",
    "CHRISTCHURCH SPITAFIELDS", "CHRISTCHURCH SPITALFIELDS",
    "FURNIVAL'S INN", "FURNIVALS INN",
    "GRAY'S INN", "GRAYS INN",
    "LINCOLN'S INN", "LINCOLNS INN",
    "ST ANDREW HUBBAR", "ST ANDREW HUBBARD",
    "ST ANNE AND ST AGNES", "ST ANN AND ST AGNES ALDERGATE",
    "RATCLIFF", "RATCLIFF HAMLET, PART IN LIMEHOUSE",
    "RATCLIFF", "RATCLIFF HAMLET, PART IN STEPNEY",
    "SARJEANT'S INN", "SERJEANTS INN CHANCERY LANE",
    "SARJEANT'S INN", "SERJEANTS INN FLEET STREET",
    "ST ALBAN", "ST ALBAN WOOD STREET",
    "ST ALPHAGE", "ST ALPHAGE SION COLLEGE",
    "ST BATHOLOMEW BY THE EXCHANGE", "ST BARTHOLOMEW BY THE ROYAL EXCHANGE",
    "ST BENET GRACECHURCH", "ST BENET GRACECHURCH STREET",
    "ST BENET PAUL'S WHARF", "ST BENET PAULS WHARF",
    "ST BOTOLPH ALDERSGATE", "ST BOTOLPH WITHOUT ALDERSGATE",
    "ST BOTOLPH ALDERSGATE", "ST BOTOLPH WITHOUT ALDGATE",
    "ST BOTOLPH BISHOPSGATE", "ST BOTOLPH WITHOUT BISHOPSGATE",
    "ST AUGUSTINE", "ST AUGUSTINE WAITING STREET",
    "ST CHRISTOPHER LE STOCK", "ST CHRISTOPHER LE STOCKS",
    "ST CLEMENT EAST CHEAP", "ST CLEMENT EASTCHEAP",
    "ST CLEMENTS DANES", "ST CLEMENT DANES",
    "ST DUNSTAIN IN THE WEST", "ST DUNSTAN IN THE WEST",
    "ST DUNSTAN STEPNEY, MILE END OLD TOWN", "ST DUNSTAN STEPNEY/MILE END",
    "ST GEORGE MIDDLESEX OR ST GEORGE IN THE EAST", "ST GEORGE IN THE EAST",
    "ALLHALLOWS LOMBARD STREET", "ALL HALLOWS LOMBARD STREET",
    "BARNARD'S INN", "BARNARDS INN",
    "OLD TOWER WITHOUT AND TOWER OF LONDON", "OLD TOWER WITHOUT",
    "CHRISTCHURCH SPITALFIELDS", "CHRISTCHURCH SPITAFIELDS",
    "ST ANN BLACKFRIARS", "ST ANNE BLACKFRIARS",
    "ST BRIDE", "ST BRIDE OR ST BRIDGET",
    "ST CHRISTOPHER LE STOCKSS", "ST CHRISTOPHER LE STOCK",
    "ST CLEMENT EASTCHEAP", "ST CLEMENT EAST CHEAP",
    "ST CLEMENT DANES", "ST CLEMENTS DANES",
    "ST DUNSTAN IN THE WEST", "ST DUNSTAIN IN THE WEST",
    "ST DUNSTAN STEPNEY/MILE END", "ST DUNSTAN STEPNEY, MILE END OLD TOWN",
    "ST GILES FIELDS", "ST GILES IN THE FIELDS",
    "ST GILES CRIPPLEGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "ST GREGORY BY ST PAUL'S", "ST GREGORY BY ST PAUL",
    "ST HELEN", "ST HELEN BISHOPSGATE",
    "ST JAMES GARLICK", "ST JAMES GARLICKHITHE",
    "ST JOHN HORSELYDOWN", "ST JOHN HORSLEYDOWN",
    "ST JOHN THE BAPTIST", "ST JOHN THE BAPTIST WALBROOK",
    "ST KATHERINE COLEMAN STREET", "ST KATHARINE COLEMAN",
    "ST KATHERINE CREE", "ST KATHERINE CREE CHURCH",
    "ST LAWRENCE JEWRY", "ST LAWERENCE JEWRY",
    "ST LAURENCE POUNTNEY", "ST LAWRENCE POUNTNEY",
    "ST LEONARD EASTC", "ST LEONARD EAST CHEAP",
    "ST MAGNUS THE MARTYR", "ST MAGNUS THE MATYR",
    "ST MARGARET FISH", "ST MARGARET NEW FISH STREET",
    "ST MARTIN ORGAR", "ST MARTIN ORGARS",
    "ST MARTIN POMARY", "ST MARTIN POMROY",
    "ST MARY COLECHURH", "ST MARY COLECHURCH",
    "ST MARY WOOLCHURCH", "ST MARY WOOLCHURCH HAW",
    "ST MICHAEL LE QUERNE", "ST MICHAEL LE QUERN",
    "ST NICHOLAS ACON", "ST NICHOLAS ACONS",
    "ST OLAVE HART STREET", "ST OLAVE HART STREET, WITH ST NICHOLAS IN THE SHAMBLES",
    "ST PETER CORNHIL", "ST PETER CORNHILL",
    "ST PETER WESTCHEAP", "ST PETER WEST CHEAP",
    "ST STEPHEN WALBR", "ST STEPHEN WALBROOK",
    "ST SWITHIN", "ST SWITHIN LONDON STONE",
    "ST VEDAST FOSTER", "ST VEDAST FOSTER LANE",
    "THE TOWER", "TOWER OF LONDON",
    "ST THOMAS THE APOSTLE", "ST THOMAS APOSTLE",
    "ST PETER LE POOR", "ST PETER LE POER BROAD STREET",
    "ST MILDRED POULTRY", "ST MILDFRED POULTRY",
    "ST JAMES DUKE STREET", "ST JAMES DUKES PLACE",
    "ST GEORGE IN THE EAST", "ST GEORGE MIDDLESEX OR ST GEORGE IN THE EAST",
    "ST ANDREW HOLBORN ABOVE THE BARS (BELOW THE BARS) BELOW THE BARS", "HOLBORN",
    "ST ANDREW HOLBORN ABOVE THE BARS (BELOW THE BARS) BELOW THE BARS GEORGE THE MARTYR", "HOLBORN",
    "SHOREDITCH", "ST LEONARD SHOREDITCH",
    "ST THOMAS", "ST THOMAS SOUTHWARK",
    "WHITECHAPEL", "ST MARY WHITECHAPEL",
    "RATCLIFF HAMLET, PART IN STEPNEY HAMLET, PART IN LIMEHOUSE", "RATCLIFF",
    "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE"
  ), ncol = 2, byrow = TRUE)

  cleaned <- napp$PARIDGB
  for (i in seq_len(nrow(rules))) {
    cleaned <- gsub(rules[i, 1], rules[i, 2], cleaned)
  }
  cleaned
})

# Street names in the NAPP data: keep letters and spaces only, strip the
# surrounding whitespace, convert to upper case.
napp$new_address <- toupper(trim(gsub("[^A-Za-z ]", "", napp$new_address)))

# Same normalisation for the names of the map points.
napp$P_NAME <- toupper(trim(gsub("[^A-Za-z ]", "", napp$P_NAME)))

# Number of distinct values after cleaning (counts recorded by the authors)
length(unique(napp$objectid_1)) # 5878
length(unique(napp$social)) # 356
length(unique(napp$P_NAME)) # 4545
length(unique(napp$new_address)) # 8487

# One row per distinct combination of parish identifiers and street names;
# this is the address table that is matched to the 1851 census below.
napp_address <- unique(
  napp[, c("social", "civil", "PARIDGB", "new_address", "P_NAME", "NAME")]
)

############################################################################################################################

####################
### 2. ICEM 1851 ###
####################

# Open the 1851 ICeM database and read the census rows with RC = 1.
db <- dbConnect(SQLite(), dbname = "icem1851.sqlite")
icem <- dbGetQuery(db, "SELECT * FROM census1851 WHERE RC=1")

# Sample restrictions, applied in sequence:
#   males, aged 15 to 60, occupation string not missing / blank / "?",
#   relationship code 10, 11 or 12.
# NOTE(review): presumably Rela codes 10-12 identify household heads - confirm
#   against the ICeM codebook.
icem <- icem[icem$Sex == "M", ]
icem <- icem[icem$Age %in% 15:60, ]
icem <- icem[
  !is.na(icem$Occ) & !(icem$Occ == "" | icem$Occ == " " | icem$Occ == "?"),
]
icem <- icem[icem$Rela == 10 | icem$Rela == 11 | icem$Rela == 12, ]

# Columns kept for the analysis.
# NOTE(review): "ParID" is listed twice, so the result carries a duplicate
#   column (ParID.1). It is kept because later code drops columns by position.
icem <- icem[, c(
  "RecID", "Occ", "Occode", "Age", "Offsp", "Mar", "Servts", "ParID",
  "Parish", "RSD", "RD", "RC", "ConParID", "ParID", "Std_Par", "Cnti",
  "BpCnty"
)]

# Control variables: married flag (Mar code 2 or 3) and numeric versions of
# age, number of servants and number of offspring.
icem$married <- ifelse(icem$Mar == 2 | icem$Mar == 3, 1, 0)
icem[c("Age", "Servts", "Offsp")] <- lapply(
  icem[c("Age", "Servts", "Offsp")],
  as.numeric
)

# Second census table: keep the rows whose RecID survived the restrictions
# above and attach them to icem with a full outer join on RecID.
icem2 <- dbGetQuery(db, "SELECT * FROM census1851b")
icem2 <- icem2[is.element(icem2$RecID, icem$RecID), ]
icem <- merge(icem, icem2, by = "RecID", all = TRUE)
rm(icem2)

# Street names: letters and spaces only, surrounding whitespace stripped,
# upper case.
icem$Address <- toupper(trim(gsub("[^A-Za-z ]", "", icem$Address)))

# Clean the names of parish (ICeM 1851).
# Surrounding whitespace is stripped first; then every row of the table is
# applied as one gsub(pattern, replacement) rule, strictly top to bottom.
# Later rules rewrite the output of earlier ones, so the row order matters.
# NOTE(review): patterns are unanchored regular expressions. The patterns that
#   contain "(" and ")" treat them as regex groups, so they match the text
#   WITHOUT literal parentheses; short patterns such as "SOUTHWARK" also
#   rewrite longer names that contain them. Confirm this is intended.
icem$Parish <- local({
  rules <- matrix(c(
    "LAMBETH, ST JOHN WATERLOO", "LAMBETH",
    "LAMBETH, ST LUKE NORWOOD", "LAMBETH",
    "LAMBETH, ST MARK KENNINGTON", "LAMBETH",
    "LAMBETH, ST MATTHEW BRIXTON", "LAMBETH",
    "LAMBETH, THE OLD CHURCH", "LAMBETH",
    "BERMONDSEY OLD FISH STREET", "BERMONDSEY",
    "ST MATTHEW BETHNAL GREEN, CHRUCH", "BETHNAL GREEN",
    "ST MATTHEW BETHNAL GREEN, GREEN", "BETHNAL GREEN",
    "ST MATTHEW BETHNAL GREEN, HACKNEY ROAD", "BETHNAL GREEN",
    "ST MATTHEW BETHNAL GREEN, TOWN", "BETHNAL GREEN",
    "BOW AKA ST MARY STRATFORD LE BOW", "BOW",
    "ST JAMES WESTMINSTER", "WESTMINSTER",
    "ST GEORGE THE MARTYR SOUTHWARK", "SOUTHWARK",
    "ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE", "HOLBORN",
    "LONDON ST MARYLEBONE, LONDON", "ST MARYLEBONE",
    "LONDON ST ST PANCRAS, L", "ST PANCRAS",
    "ST GILES IN THE FIELDS AND ST GEORGE BLOOMSBURY", "ST GILES FIELDS",
    "CHRISTCHURCH SOUTHWARK", "SOUTHWARK",
    "ST OLAVE SOUTHWARK", "SOUTHWARK",
    "ST SAVIOUR SOUTHWARK", "SOUTHWARK",
    "ST MARGARET AND ST JOHN THE EVANGELIST WESTMINSTER", "WESTMINSTER",
    "ST MARGARET AND ST JOHN THE EVANGELIST WESTMINSTER", "WESTMINSTER",
    "ST SAVIOUR SOUTHWARK", "SOUTHWARK",
    "ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE THE MARTYR", "HOLBORN",
    "ST GEORGE THE MARTYR SOUTHWARK", "SOUTHWARK",
    "ST BOTOLPH WITHOUT ALDGATE OR EAST SMITHFIELD", "ALDGATE",
    "ST MARY MAGDALEN BERMONDSEY", "BERMONDSEY",
    "ST JOHN THE EVANGELIST WESTMINSTER", "WESTMINSTER",
    "ST GILES CAMBERWELL", "CAMBERWELL",
    "ST GILES CAMBERWELL, ST GEORGE", "CAMBERWELL",
    "ST MARY MAGDALEN BERMONDSEY, ST JAMES", "BERMONDSEY",
    "ST MARY ROTHERHITHE", "ROTHERHITHE",
    "ST MARY ALDERMANBURY", "ST MARY ALDERMARY",
    "ST GILES CAMBERWELL, PECKHAM", "CAMBERWELL",
    "ST PAUL SHADWELL", "SHADWELL",
    "ST JAMES CLERKENWELL", "CLERKENWELL",
    "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "ST BOTOLPH WITHOUT ALDGATE", "ALDGATE",
    "ST MARGARET WESTMINSTER (INCLUDING THE PRIVY GARDENS AND WHITEHALL)", "WESTMINSTER",
    "ST ANNE LIMEHOUSE (ALL EXCEPT THE PART OF THE HAMLET OF RATCLIFF THEREIN)", "LIMEHOUSE",
    "ST BOTOLPH WITHOUT ALDGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "WHITEFRIARS", "PRECINCT OF WHITEFRIARS",
    "ST MARY NEWINGTON, HOLY TRINITY", "NEWINGTON",
    "THE ROLLS", "LIBERTY OF THE ROLLS",
    "NORTON FOLGATE", "LIBERTY OF NORTON FOLGATE",
    "ST LEONARD SHOREDITCH, HOLWELL AND MOORFIELDS", "ST LEONARD SHOREDITCH",
    "ST MRY LE BOW", "ST MARY LE BOW OR ST MARY STRATFORD LE BOW",
    "ST THOMAS APOSTLE", "ST THOMAS SOUTHWARK APOSTLE",
    "BRIDEWELL PRECINCTS", "PRECINCT OF BRIDEWELLL",
    "ALL SAINTS POPLAR", "POPLAR",
    "GLASSHOUSE YARD", "LIBERTY OF GLASSHOUSE YARD",
    "SAFFRON HILL, HATTON GARDEN, ELY RENTS, AND ELY PLACE", "LIBERTY OF SAFFRON HILLD ELY PLACE",
    "ST ANDREW HOLBORN ABOVE THE BARS", "ST ANDREW HOLBORN ABOVE THE BARS (BELOW THE BARS) BELOW THE BARS GEORGE THE MARTYR",
    "ST ANDREW HOLBORN (BELOW THE BARS)", "ST ANDREW HOLBORN ABOVE THE BARS (BELOW THE BARS) BELOW THE BARS",
    "ST GABRIEL FENCHURCH STREET", "STREET ST GABRIEL FENCHURCH STREET",
    "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "SOUTHWARK", "ST THOMAS SOUTHWARK",
    "ST MARY STROKE NEWINGTON", "NEWINGTON",
    "BERMONDSEY, ST JAMES", "BERMONDSEY",
    "CAMBERWELL, ST GEORGE", "CAMBERWELL",
    "CAMBERWELL, PECKHAM", "CAMBERWELL",
    "RATCLIFF HAMLET, PART IN STEPNEY", "RATCLIFF",
    "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "ST MARY NEWINGTON, ST MARY", "NEWINGTON",
    "ST THOMAS SOUTHWARK", "SOUTHWARK",
    "ST ANNE LIMEHOUSE (ALL EXCEPT THE PART OF THE HAMLET OF RATCLIFF THEREIN)", "LIMEHOUSE",
    "BERMONDSEY, ST JAMES", "BERMONDSEY",
    "CAMBERWELL, PECKHAM", "CAMBERWELL",
    "ST MARY NEWINGTON, ST PETER WALWORTH", "NEWINGTON",
    "ST MARGARET WESTMINSTER (INCLUDING THE PRIVY GAR)", "WESTMINSTER",
    "ST THOMAS ST THOMAS SOUTHWARK APOSTLE", "ST THOMAS SOUTHWARK APOSTLE",
    "CAMBERWELL, DULWICH", "CAMBERWEL",
    "CHRISTCHURCH SPITAFIELDS", "SPITAFIELDS",
    "BOW OR ST MARY STRATFORD LE BOW", "BOW",
    "ELTHAM, MOTTINHAM", "ELTHAM",
    "RATCLIFF HAMLET, PART IN LIMEHOUSE", "RATCLIFF",
    "SERJEANTS INN CHANCERY LANE", "SERJEANTS INN FLEET STREET",
    "ST ANDREW HOLBORN (BELOW THE BARS)", "HOLBORN",
    "ST ANDREW HOLBORN ABOVE THE BARS (BELOW THE BARS) BELOW THE BARS GEORGE THE MARTYR", "HOLBORN",
    "ST ANNE LIMEHOUSE (ALL EXCEPT THE PART OF THE HAMLET OF RATCLIFF THEREIN)", "LIMEHOUSE",
    "ST BRIDE OR ST BRIDGET", "ST BRIDE",
    "ST GEORGE THE MATYR", "ST GEORGE THE MARTYR",
    "ST THOMAS SOUTHWARK APOSTLE", "ST THOMAS SOUTHWARK",
    "STREET ST GABRIEL FENCHURCH STREET", "FENCHURCH STREET",
    "ST MAGNUS THE MATYR", "ST MAGNUS THE MARTYR",
    "ST THOMAS SOUTHWARK APOSTLE", "ST THOMAS SOUTHWARK"
  ), ncol = 2, byrow = TRUE)

  cleaned <- trim(icem$Parish)
  for (i in seq_len(nrow(rules))) {
    cleaned <- gsub(rules[i, 1], rules[i, 2], cleaned)
  }
  cleaned
})

# Number of distinct values after cleaning (counts recorded by the authors)
length(unique(icem$Address)) # 23355
length(unique(icem$Parish)) # 206

# Occupational category of each individual.
# load() brings the object `dictionary` into the workspace; only its entries
# with a non-missing, non-empty category other than "domestic" are used.
load(file = "dictionary_final3.RData")
dictionary <- dictionary[
  !is.na(dictionary$cat) &
    !(dictionary$cat == "domestic" | dictionary$cat == ""),
  c("Occ", "cat")
]

# Attach the category by occupation string, then keep only the individuals
# for whom a category was found.
icem <- merge(icem, dictionary, by = "Occ", all.x = TRUE)
icem <- icem[!is.na(icem$cat) & icem$cat != "", ]

# Clean the county of birth (BpCnty).
# Each row of the table is one gsub(pattern, replacement) rule; the rules run
# strictly top to bottom and later rules repair the output of earlier ones
# (e.g. "BETHNAL GREENREEN"), so the row order must be preserved.
# Surrounding whitespace is stripped once all rules have been applied.
# NOTE(review): patterns are unanchored regular expressions, so the "." in
#   "ST. ST PANCRAS", "ST.LUKES" and "ST.ST PANCRAS" matches any character and
#   short patterns such as "B G" can hit inside longer strings - confirm.
icem$BpCnty <- local({
  rules <- matrix(c(
    "PANCRAS", "ST PANCRAS",
    "BETHNAL G", "BETHNAL GREEN",
    "MARYLEBONE", "ST MARYLEBONE",
    "ROTHERHE", "ROTHERHITHE",
    "LONDON BETHNAL GREEN", "BETHNAL GREEN",
    "B GREEN", "BETHNAL GREEN",
    "BETH GREEN", "BETHNAL GREEN",
    "SOUTHW ST OLAVE", "ST OLAVE SOUTHWARK",
    "ST ST PANCRAS", "ST PANCRAS",
    "BETHNAL GREENREEN", "BETHNAL GREEN",
    "BETHNAL GREENN", "BETHNAL GREEN",
    "ST LEONARDS SHOREDIT", "SHOREDITCH",
    "ST LUKES", "ST LUKE",
    "ST GEORGES EAST", "ST GEORGE IN THE EAST",
    "ST ST MARYLEBONE", "ST MARYLEBONE",
    "BETHNAL GREENR", "BETHNAL GREEN",
    "ST GEORGES EAST", "ST GEORGE IN THE EAST",
    "SOUTHK", "ST SAVIOUR SOUTHWARK",
    "STHWARK", "ST SAVIOUR SOUTHWARK",
    "WESTMINSTER ST JAMES", "ST JAMES WESTMINSTER",
    "BMDSY", "BERMONDSEY",
    "BERSY", "BERMONDSEY",
    "ST LEONARDS SHOREDIT", "SHOREDITCH",
    "MILE END", "MILE END OLD TOWN",
    "B'SEA", "BATTERSEA",
    "MILE END", "MILE END OLD TOWN",
    "LONDON ISLINGTON, LO", "ISLINGTON",
    "SHOREH", "SHOREDITCH",
    "ST GEO HAN SQ", "ST GEORGE HANOVER SQUARE",
    "SOUTHWARK", "ST SAVIOUR SOUTHWARK",
    "BERMY", "BERMONDSEY",
    "ST GEORGE HAN SQ", "ST GEORGE HANOVER SQUARE",
    "CLERKLL", "CLERKENWELL",
    "WESTMR", "WESTMINSTER",
    "BETH GN", "BETHNAL GREEN",
    "CHRIST CHURCH", "SOUTHWARK",
    "ST. ST PANCRAS", "ST PANCRAS",
    "WESTMINSTER ST JAMES", "ST JAMES WESTMINSTER",
    "BETHNAL GREENR", "BETHNAL GREEN",
    "LAMBH", "LAMBETH",
    "M E O TOWN", "MILE END OLD TOWN",
    "M E O T", "MILE END OLD TOWN",
    "WESTR", "WESTMINSTER",
    "LONDON CLERKL, LONDO", "CLERKENWELL",
    "CLERKLL", "CLERKENWELL",
    "LAMBETT", "LAMBETH",
    "BATTORSEA", "BATTERSEA",
    "BETH GRN", "BETHNAL GREEN",
    "LONDON LAMBETH, LOND", "LAMBETH",
    "MILE END N T", "MILE END NEW TOWN",
    "ST GEORGE E", "ST GEORGE IN THE EAST",
    "CLERKEN WELL", "CLERKENWELL",
    "WCHAPEL", "WHITECHAPEL",
    "ST GEORGES EAST", "ST GEORGE IN THE EAST",
    "STHWARK", "SOUTHWARK",
    "SHORDITCH", "SHOREDITCH",
    "OLD ST ST LUKE", "ST LUKE",
    "ST MARGT WESTMR", "WESTMINSTER",
    "SGHOREDITCH", "SHOREDITCH",
    "CAMBLL", "CAMBERWELL",
    "LONDON ISLINGTON, LO", "ISLINGTON",
    "CLERKLL", "CLERKENWELL",
    "ST.LUKES", "ST LUKE",
    "LONDON LAMBETH, LOND", "LAMBETH",
    "LONDON ST ST PANCRAS, L", "ST PANCRAS",
    "ROTHRHITHE", "ROTHERHITHE",
    "ST MARTINS", "ST MARTIN IN THE FIELDS",
    "B G", "BETHNAL GREEN",
    "BETH GRN", "BETHNAL GREEN",
    "ST.ST PANCRAS", "ST PANCRAS",
    "BERMONSEY", "BERMONDSEY",
    "B GN", "BETHNAL GREEN"
  ), ncol = 2, byrow = TRUE)

  cleaned <- icem$BpCnty
  for (i in seq_len(nrow(rules))) {
    cleaned <- gsub(rules[i, 1], rules[i, 2], cleaned)
  }
  trim(cleaned)
})

# Clean the parish of birth (Std_Par).
# The names are upper-cased first; then each row of the table is applied as
# one gsub(pattern, replacement) rule, strictly top to bottom.
# NOTE(review): patterns are unanchored and applied in sequence, so a general
#   rule can pre-empt a more specific one that follows it (the first
#   "ALLHALLOWS" rule rewrites every string the next six rules look for, and
#   "ST PETER" rewrites "ST PETER LE POER" before its own rule runs).
#   Confirm this is the intended result.
icem$Std_Par <- local({
  rules <- matrix(c(
    "ALLHALLOWS", "ALL HALLOWS",
    "ALLHALLOWS BARKING", "ALL HALLOWS BARKING",
    "ALLHALLOWS BREAD STREET", "ALL HALLOWS BREAD STREET",
    "ALLHALLOWS HONEY LANE", "ALL HALLOWS HONEY LANE",
    "ALLHALLOWS LOMBARD STREET", "ALL HALLOWS LOMBARD STREET",
    "ALLHALLOWS LONDON WALL", "ALL HALLOWS LONDON WALL",
    "ALLHALLOWS STAINING", "ALL HALLOWS STAINING",
    "BOW AKA ST MARY STRATFORD LE BOW", "BOW",
    "BROMLEY", "BROMLEY ST LEONARD",
    "BUCKINGHAM", "BUCKINGHAM PALACE",
    "CHRISTCHURCH SOUTHWARK", "SOUTHWARK",
    "CLIFFORD", "CLIFFORDS INN",
    "GRAYS", "GRAYS INN",
    "HOLYWELL", "HOLYWELL WARD",
    "ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE THE", "HOLBORN",
    "ST ANDREW HOLBORN ABOVE THE BARS AND ST GEORGE THE MARTYR", "HOLBORN",
    "ST ANDREW HOLBORN BELOW THE BARS", "HOLBORN",
    "ST ANN AND ST AGNES ALDERSGATE", "ST ANN AND ST AGNES ALDERGATE",
    "ST ANN BLACKFRIARS", "ST ANNE BLACKFRIARS",
    "ST AUGUSTINE WATLING STREET", "ST AUGUSTINE WAITING STREET",
    "ST CLEMENT EASTCHEAP", "ST CLEMENT EAST CHEAP",
    "ST CLEMENT DANES", "ST CLEMENTS DANES",
    "ST DUNSTAN IN THE WEST", "ST DUNSTAIN IN THE WEST",
    "ST GEORGE THE MARTYR SOUTHWARK", "ST GEORGE THE MARTYR",
    "ST GEORGE THE MTYR SOUTHWARK", "ST GEORGE THE MARTYR",
    "ST GILES IN THE FIELDS AND ST GEORGE BLOOMSBURY", "ST GILES IN THE FIELDS",
    "ST GILES WITHOUT CRIPPLEGATE", "ST GILES WITHOUT CRIPPLEGATE ST BOTOLPH WITHOUT ALDERSGATE",
    "ST JOHN THE BAPTIST", "ST JOHN THE BAPTIST WALBROOK",
    "ST KATHERINE CREE", "ST KATHERINE CREE CHURCH",
    "ST LAWRENCE JEWRY", "ST LAWERENCE JEWRY",
    "ST LEONARDS", "ST LEONARD SHOREDITCH",
    "ST MARGARET AND ST JOHN THE EVANGELIST WESTMINSTER", "ST MARGARET WESTMINSTER (INCLUDING THE PRIVY GARDENS AND WHITEHALL)",
    "ST OLAVE HART STREET WITH ST NICHOLAS IN THE SHAMBLES", "ST OLAVE HART STREET, WITH ST NICHOLAS IN THE SHAMBLES",
    "ST PETER", "ST PETER CORNHILL",
    "ST PETER LE POER", "ST PETER LE POER BROAD STREET",
    "ST THOMAS THE APOSTLE", "ST THOMAS SOUTHWARK",
    "ST THOMAS APOSTLE", "ST THOMAS SOUTHWARK",
    "STRENSHAM", "STREATHAM"
  ), ncol = 2, byrow = TRUE)

  cleaned <- toupper(icem$Std_Par)
  for (i in seq_len(nrow(rules))) {
    cleaned <- gsub(rules[i, 1], rules[i, 2], cleaned)
  }
  cleaned
})

# Stayer / mover indicators relative to `Parish`:
#   stayerp = 1 when either the standardised parish (Std_Par) or the BpCnty
#             field equals `Parish`, 0 when neither does (NA is propagated
#             when the comparison is undetermined);
#   moverp  = complement of stayerp.
icem$stayerp <- ifelse(
  icem$Std_Par == icem$Parish | icem$BpCnty == icem$Parish,
  1, 0
)
icem$moverp <- 1 - icem$stayerp

# The occupation dictionary is not used below this point; free the memory.
rm(dictionary)

############################################################################################################################

#####################################################################
### 3. Match NAPP ADDRESS to 1851 ICEM based on social + address  ###
### = ICEM with variables social, civil and P_NAME ###
#####################################################################

# merge on parish and street
# Pass 1: inner join (all=FALSE keeps matched rows only) of the NAPP address
# table with ICeM, requiring NAPP parish id (PARIDGB) == ICeM Parish and
# NAPP street name (P_NAME) == ICeM Address.
# The trailing "# <number>" comments on the length(unique(...)) lines are the
# counts noted by the authors (presumably from their own run); the calls only
# print when the script is run interactively or with echo.
data <- merge(napp_address, icem, by.x=c("PARIDGB", "P_NAME"), by.y=c("Parish", "Address"), all=FALSE)
length(unique(data$social)) # 322
length(unique(data$PARIDGB)) # 136
length(unique(data$P_NAME)) # 2238
length(unique(data$RecID)) # 92150

# Pass 2: ICeM records (identified by RecID) that pass 1 did not match are
# retried with the alternative street spelling `new_address` in place of P_NAME.
icem2 <- icem[!(icem$RecID %in% data$RecID),]
data2 <- merge(napp_address, icem2, by.x=c("PARIDGB", "new_address"), by.y=c("Parish", "Address"), all=FALSE)
length(unique(data2$social)) # 143
length(unique(data2$PARIDGB)) # 61
length(unique(data2$P_NAME)) # 265
length(unique(data2$RecID)) # 4027

# Stack both passes. rbind() on data frames aligns columns by name, so the
# different key-column order of the two merges is harmless; the result keeps
# the column order of `data`.
data = rbind(data, data2)
# Drop the 5th and 6th columns BY POSITION, then remove exact duplicate rows.
# NOTE(review): which columns sit at positions 5-6 depends on the layout of
# napp_address, which is not visible here -- confirm before changing any
# upstream column order.
data = unique(data[, -c(5,6)])
rm(data2, icem2)

# Totals after combining the two parish+street passes.
length(unique(data$social)) # 325
length(unique(data$PARIDGB)) # 136
length(unique(data$P_NAME)) # 2390
length(unique(data$RecID)) # 96,177

# merge based on address only
# Pass 3: ICeM records still unmatched after the parish+street passes are
# joined on street name alone (P_NAME == Address); the parish is no longer
# required to agree. Because `Parish` is not a join key here, it is carried
# into the result as an ordinary ICeM column.
icem2 <- icem[!(icem$RecID %in% data$RecID),]
data2 <- merge(napp_address, icem2, by.x="P_NAME", by.y="Address", all=FALSE)

# Pass 4: same idea with the alternative spelling `new_address`, restricted to
# records that pass 3 did not match either.
# (The first assignment below recomputes exactly the icem2 built above; it is
# redundant but harmless.)
icem2 <- icem[!(icem$RecID %in% data$RecID),]
icem2 <- icem2[!(icem2$RecID %in% data2$RecID),]
data3 <- merge(napp_address, icem2, by.x="new_address", by.y="Address", all=FALSE)

# Stack passes 3 and 4 (columns aligned by name), drop columns 5-6 by
# position and de-duplicate rows.
# NOTE(review): positions 5-6 are counted in THIS merge's column order, which
# starts with P_NAME only (not PARIDGB, P_NAME as in the parish+street
# passes) -- confirm they refer to the same two columns dropped there.
data2 = rbind(data2, data3)
data2 = unique(data2[, -c(5,6)])
rm(data3, icem2)

# Counts noted by the authors for the address-only matches.
length(unique(data2$social)) # 337
length(unique(data2$PARIDGB)) # 147
length(unique(data2$P_NAME)) # 1281
length(unique(data2$RecID)) # 66372

# Append the address-only matches to the parish+street matches.
# plyr::rbind.fill() is used instead of rbind() because the two tables do not
# have identical column sets; columns missing on one side are filled with NA.
data <- plyr::rbind.fill(data, data2)
rm(data2)

# Deal with duplicates
# An ICeM record (RecID) can have been matched to more than one NAPP address
# row. x is a constant 1, so xxx = sum(x) by RecID is the number of rows that
# share a RecID.
data$x=1
data <- data.table(data)
data <- data[, xxx:=sum(x), by="RecID"]
data <- data.frame(data)
# Keep only RecIDs that occur exactly once: every copy of an ambiguously
# matched record is discarded, not just the surplus copies.
# Columns 17, 30, 31 and 32 are dropped BY POSITION.
# NOTE(review): presumably 31-32 are the helper columns x and xxx; the
# identity of columns 17 and 30 cannot be determined from here -- confirm.
data = data[data$xxx==1,-c(17,30,31,32)]
rm(napp_address)

# Counts noted by the authors for the final matched 1851 sample.
length(unique(data$social)) # 342
length(unique(data$PARIDGB)) # 153
length(unique(data$P_NAME)) # 2747
length(unique(data$RecID)) # 119,468

############################################################################################################################

############################################################################
### 4. Create share by occupation at the social level for 1851 ICEM data ###
############################################################################

# Everything in this section is computed with data.table and converted back to
# a plain data.frame at the end.
data <- data.table(data)

# 0/1 indicator per occupational category (NA wherever `cat` is NA).
data[, professional := ifelse(cat == "professional", 1, 0)]
data[, artisan := ifelse(cat == "artisan", 1, 0)]
data[, builder := ifelse(cat == "builder", 1, 0)]
data[, commercial := ifelse(cat == "commercial", 1, 0)]
data[, food := ifelse(cat == "food", 1, 0)]
data[, service := ifelse(cat == "service", 1, 0)]

# Group sizes:
#   total   = number of matched 1851 records per (social, P_NAME) cell
#   total_c = number of those records in the row's own category
data[, total := .N, by = .(social, P_NAME)]
data[, total_c := .N, by = .(cat, social, P_NAME)]

# Share of each category within a (social, P_NAME) cell.
# flag * total_c / total equals the category share on rows of that category
# and 0 on all other rows; taking the cell maximum spreads the share to every
# row of the cell.
data[, sh_professional := max(professional * total_c / total), by = .(social, P_NAME)]
data[, sh_artisan := max(artisan * total_c / total), by = .(social, P_NAME)]
data[, sh_builder := max(builder * total_c / total), by = .(social, P_NAME)]
data[, sh_commercial := max(commercial * total_c / total), by = .(social, P_NAME)]
data[, sh_food := max(food * total_c / total), by = .(social, P_NAME)]
data[, sh_service := max(service * total_c / total), by = .(social, P_NAME)]

# Cell-level averages of the 1851 individual characteristics (NAs ignored).
data[, `:=`(
  avg_servant_1851 = mean(as.numeric(Servts), na.rm = TRUE),
  avg_age_1851     = mean(as.numeric(Age), na.rm = TRUE),
  avg_married_1851 = mean(married, na.rm = TRUE),
  avg_stayerp_1851 = mean(stayerp, na.rm = TRUE),
  avg_child_1851   = mean(Offsp, na.rm = TRUE)
), by = .(social, P_NAME)]

data <- data.frame(data)

# Collapse to one row per distinct combination of identifiers and cell-level
# statistics; individual-level columns are discarded.
data <- unique(data[, c(
  "P_NAME", "PARIDGB", "social", "civil", "ConParID",
  "sh_professional", "sh_artisan", "sh_builder",
  "sh_commercial", "sh_food", "sh_service",
  "avg_servant_1851", "avg_age_1851", "avg_married_1851",
  "avg_stayerp_1851", "avg_child_1851"
)])

############################################################################################################################

####################################################################
### 5. Merge NAPP 1881 to ICEM based on civil, social and p_name ###
####################################################################

length(unique(napp$RECIDGB)) # 217915

# Inner join of the 1881 NAPP individuals with the 1851 cell-level statistics
# on (civil, social, P_NAME); individuals without a matched cell are dropped.
all <- merge(napp, data, by = c("civil", "social", "P_NAME"), all = FALSE)
# moverp is the complement of the stayerp column present in the merged table.
all$moverp <- 1 - all$stayerp
# Columns 27 and 28 are removed by position before de-duplicating rows.
all <- unique(all[, -c(27, 28)])

nrow(all) # 135924
length(unique(all$HHNUMhead)) # 135924
length(unique(all$RECIDGB)) # 134953

# Baseline-sample dummy: sample = 1 for households whose HHNUMhead appears
# among the IDs of the pooled estimation data, 0 otherwise.
root <- paste0("/Users/", Sys.info()[["user"]], "/Dropbox/NetworkParish2016/Occupational_Choice/Submission/Final_ReStat/Codes/")
wddata <- paste0(root, c("Results/ReStat/Placebo/Coords/", "NAPP/Proof2RCoordnew.dta"))
NAPP <- read.dta(wddata[2], convert.factors = FALSE)
# load() brings the object `pooleddata` into the workspace.
load(file = paste0(root, "NAPP/ReStat/pooleddataNAPPneww50list.RData"))
HHstudied <- unique(pooleddata$ID)
NAPP <- NAPP[NAPP$HHNUMhead %in% HHstudied, , drop = FALSE]
rm(pooleddata)
all$sample <- ifelse(all$HHNUMhead %in% NAPP$HHNUMhead, 1, 0)

# Write the final dataset to the working directory.
write.csv(all, "movers.csv")
